../../posts - Data Summaries

# Social media data and summarise()

# Read the tweets object stored in ncod_tweets.rds.
tweets <- readRDS("ncod_tweets.rds")

# Collapse the whole table into a single row describing favorite_count:
# its mean, median, smallest and largest value.
favourite_summary <- tweets %>%
  summarise(
    mean_favs = mean(favorite_count),
    median_favs = median(favorite_count),
    min_favs = min(favorite_count),
    max_favs = max(favorite_count)
  )

# One-row overview of the tweets: average and median favourites (the median
# is taken as the 0.5 quantile), the number of rows, and the earliest and
# latest created_at timestamps.
tweet_summary <- summarise(
  tweets,
  mean_favs = mean(favorite_count),
  median_favs = quantile(favorite_count, probs = 0.5),
  n = n(),
  min_date = min(created_at),
  max_date = max(created_at)
)

# Print the summary with one line per column.
glimpse(tweet_summary)

Rows: 1
Columns: 5
$ mean_favs   <dbl> 29.71732
$ median_favs <dbl> 3
$ n           <int> 28626
$ min_date    <dttm> 2021-10-10 00:10:02
$ max_date    <dttm> 2021-10-12 20:12:27

Histogram of favorite_count, drawn with ggplot

# Histogram of favourite counts in 25 bins.
# The pseudo-log x axis steps up in orders of magnitude (0, 1, 10, 100, ...),
# which makes it easier to see how many tweets fall between each step.
ggplot(data = tweets, mapping = aes(x = favorite_count)) +
  geom_histogram(bins = 25) +
  scale_x_continuous(
    trans = "pseudo_log",
    breaks = c(0, 1, 10, 100, 1000, 10000)
  )

tweets$source[2] # `$` extracts the `source` column; `[2]` displays its second value

[1] "Twitter for iPhone"

# `%>%` is the pipe operator: it sends the output of the expression on its
# left into the function call on its right, so steps read top to bottom.
tweet_summary <- tweets %>% # start with the object tweets, and then
  summarise( # summarise it
    mean_favs = mean(favorite_count),
    median_favs = median(favorite_count)
  )

# Piped version of the fuller summary: favourites (mean and 0.5 quantile),
# row count, and the first and last created_at timestamps.
tweet_summary <- tweets %>%
  summarise(
    mean_favs = mean(favorite_count),
    median_favs = quantile(favorite_count, probs = 0.5),
    n = n(),
    min_date = min(created_at),
    max_date = max(created_at)
  )

# Print the summary with one line per column.
glimpse(tweet_summary)

Rows: 1
Columns: 5
$ mean_favs   <dbl> 29.71732
$ median_favs <dbl> 3
$ n           <int> 28626
$ min_date    <dttm> 2021-10-10 00:10:02
$ max_date    <dttm> 2021-10-12 20:12:27

tweet_summary$mean_favs # the $ operator extracts the mean_favs column from tweet_summary

[1] 29.71732

# Inline code is also very helpful when writing a paper: a value computed in
# R can be placed directly in a sentence with `r expression`
# (for example `r date_from`), so the text always matches the data.
# Here the first and last tweet timestamps are formatted as
# "day month, year" text, ready to be used inline.
date_from <- format(tweet_summary$min_date, "%d %B, %Y")
date_to <- format(tweet_summary$max_date, "%d %B, %Y")

There were 28626 tweets between 10 October, 2021 and 12 October, 2021.

# Per-group summary: for each value of `verified`, count the tweets and
# average their favourites and retweets.
# `.groups = "drop"` returns an ungrouped tibble, so no separate ungroup()
# step is needed afterwards.
verified <- tweets %>%
  group_by(verified) %>%
  summarise(
    count = n(),
    mean_favs = mean(favorite_count),
    mean_retweets = mean(retweet_count),
    .groups = "drop"
  )

verified

# A tibble: 2 × 4
  verified count mean_favs mean_retweets
  <lgl>    <int>     <dbl>         <dbl>
1 FALSE    26676      18.4          1.83
2 TRUE      1950     184.          21.5

# For each value of is_quote, keep the tweet(s) whose favorite_count equals
# the group maximum, then draw one of them at random so that ties still
# leave exactly one row per group.
most_fav <- tweets %>%
  group_by(is_quote) %>%
  filter(favorite_count == max(favorite_count)) %>%
  slice_sample(n = 1) %>% # slice_sample() replaces the superseded sample_n()
  ungroup()

#Inline coding 2

# Tally the tweets per screen_name; sort = TRUE puts the largest counts first.
tweets_per_user <- count(tweets, screen_name, sort = TRUE)

# Show the first rows of the tally.
head(tweets_per_user)

# A tibble: 6 × 2
  screen_name         n
  <chr>           <int>
1 interest_outfit    35
2 LeoShir2           33
3 NRArchway          32
4 dr_stack           32
5 bhavna_95          25
6 WipeHomophobia     23

# tweets_per_user has one row per screen_name, so its row count is the
# number of distinct accounts.
unique_users <- nrow(tweets_per_user)

# The tally was built with sort = TRUE, so its first row is the account with
# the most tweets: pull out that account's name and its tweet count.
most_prolific <- tweets_per_user %>%
  slice(1) %>%
  pull(screen_name)
most_prolific_n <- tweets_per_user %>%
  slice(1) %>%
  pull(n)

There were 25189 unique accounts tweeting about #NationalComingOutDay. interest_outfit was the most prolific tweeter, with 35 tweets.

#Extra challenge problem

# First attempt: read the file and let readr guess every column's type.
ny_data <- readr::read_csv(file = "New_York_City_Leading_Causes_of_Death.csv")

Rows: 1094 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): Leading Cause, Sex, Race Ethnicity, Deaths, Death Rate, Age Adjuste...
dbl (1): Year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Explicit column types for the causes-of-death file, so the count and rate
# columns are parsed as numbers rather than left as character.
# The helpers are namespaced to match the readr::read_csv() call below, so
# this block does not depend on readr being attached.
corrected_nydata <- readr::cols(
  Year = readr::col_double(),
  `Leading Cause` = readr::col_character(),
  Sex = readr::col_character(),
  `Race Ethnicity` = readr::col_character(),
  Deaths = readr::col_double(),
  `Death Rate` = readr::col_number(),
  `Age Adjusted Death Rate` = readr::col_number()
)

# Re-read the file with those types; a lone "." is treated as missing.
ny_data <- readr::read_csv(
  "New_York_City_Leading_Causes_of_Death.csv",
  col_types = corrected_nydata,
  na = "."
)

# Mean number of deaths for each leading cause, drawn as one point per cause.
# Missing Deaths values are ignored when averaging.
summary_nydata <- ny_data %>%
  group_by(`Leading Cause`) %>%
  summarise(mean_death = mean(Deaths, na.rm = TRUE)) %>%
  ggplot(aes(x = `Leading Cause`, y = mean_death)) +
  geom_point(na.rm = TRUE) +
  # Tilt the x-axis labels and anchor them at their top-right corner.
  theme(axis.text.x = element_text(angle = 23, vjust = 1, hjust = 1)) +
  # `limits` takes exactly two values and `breaks` is a separate argument;
  # previously `breaks` was written inside c(), so it was folded into the
  # limits vector and never set the axis breaks.
  scale_y_continuous(
    name = "mean_death",
    limits = c(0, 1500),
    breaks = seq(0, 1500, 100)
  )

summary_nydata